%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from tslearn.preprocessing import TimeSeriesScalerMeanVariance as tsmv
from hdbscan import HDBSCAN
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from modules.utils import create_filters
from modules.visualization import curves_visualizer, profiles_visualizer
from modules.visualization import visualize_cluster_overlap
from modules.clustering import rule_based_clustering, auto_k_means
from modules.autoencoders import RecurrentAutoEncoder
def sns_styleset():
    """Configure seaborn + matplotlib for publication-quality figures."""
    sns.set(context='paper', style='whitegrid', font='DejaVu Sans')
    # Apply all rc overrides in a single batched update rather than
    # item-by-item assignments.
    matplotlib.rcParams.update({
        'figure.dpi': 300,
        'axes.linewidth': 1,
        'xtick.major.width': 1,
        'ytick.major.width': 1,
        'xtick.major.size': 3,
        'ytick.major.size': 3,
        'xtick.minor.size': 2,
        'ytick.minor.size': 2,
        'font.size': 11,
        'axes.titlesize': 11,
        'axes.labelsize': 12,
        'legend.fontsize': 10,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,
    })
sns_styleset()

# Performance metrics whose learning curves are analysed below.
TARGETS = [
    'gpm',
    'kda'
]
# Pretty display names for plot axes/legends.
target_rmp = {
    'gpm': 'GPM',
    'kda': 'KDA'
}
# Data variant selector; used to build all input paths.
TYPE = 'non_smoothed'
# Pre-computed embeddings for this variant: one for clustering features,
# one for 2-D visualisation (lowercase f-prefix used consistently; the
# original mixed F'' and f'').
features_embedding = np.load(f'results\\arrays\\embedding_feat_{TYPE}.npy')
viz_embedding = np.load(f'results\\arrays\\embedding_viz_{TYPE}.npy')
df = pd.read_csv(f'data\\df_{TYPE}.csv')
df = df.sort_values(['account_id', 'nth_match', 'date'])
# Constant grouper so "all accounts" can be plotted as a single group.
df['global'] = 1
# Min-max normalise rating to [0, 1].
df['rating'] = (df['rating'] - df['rating'].min()) / (df['rating'].max() - df['rating'].min())
df.head()  # notebook display cell; no effect when run as a plain script
# Retain only accounts whose median performance over the first 10 matches
# falls in the middle bins (40-59 of 100) — controls for starting skill.
middle_bins = list(range(40, 60))
accounts_to_retain = create_filters(
    df=df,
    targets=TARGETS,
    cal_period=10,
    n_bins=100,
    to_retain_bins=middle_bins,
)
# Learning curves split by in-game position.
curves_visualizer(
    df=df,
    targets=TARGETS,
    grouper='position',
    grouper_rmp={'position': 'Position'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    legend=True,
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Learning curves split by server region.
curves_visualizer(
    df=df,
    targets=TARGETS,
    grouper='region',
    grouper_rmp={'region': 'Region'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    legend=True,
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Time-gap profile over the 2-D visualisation embedding for the whole
# population (single 'global' group), so no legend is needed.
profiles_visualizer(
    df=df,
    target='time_gap',
    target_rmp={'time_gap': 'TIME GAP'},
    dim_reduction=viz_embedding,
    grouper='global',
    grouper_rmp={'global': 'Global'},
    cmap='tab10',
    legend=False,
)
# Aggregate learning curves across all retained accounts combined.
curves_visualizer(
    df=df,
    targets=TARGETS,
    grouper='global',
    grouper_rmp={'global': 'Global'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    accounts_to_retain=accounts_to_retain,
)
# Rule-based clustering on match-day spacing; yields one label per account.
rule_labels = rule_based_clustering(df, feature='nth_day')
# Broadcast per-account labels back onto per-match rows.  dict(zip(...))
# replaces the redundant dict comprehension over the same zip.
# NOTE(review): assumes rule_labels is ordered like df['account_id'].unique()
# — appears to be the convention throughout this script; confirm in
# rule_based_clustering.
df['rule_clusters'] = df['account_id'].map(
    dict(zip(df['account_id'].unique(), rule_labels))
)
# Time-gap profiles coloured by the rule-based ("Stafford") clusters.
profiles_visualizer(
    df=df,
    target='time_gap',
    target_rmp={'time_gap': 'TIME GAP'},
    dim_reduction=viz_embedding,
    grouper='rule_clusters',
    grouper_rmp={'rule_clusters': 'Stafford Spacing'},
    cmap='tab10',
    legend=True,
)
# Learning curves per rule-based cluster; label -1 marks unassigned
# accounts and is excluded.
rule_clustered = df[df['rule_clusters'] != -1].sort_values('rule_clusters')
curves_visualizer(
    df=rule_clustered,
    targets=TARGETS,
    grouper='rule_clusters',
    grouper_rmp={'rule_clusters': 'Stafford Spacing'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    legend=True,
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Density-based clustering on the feature embedding.
hdb_clust = HDBSCAN(
    min_cluster_size=4000,
    min_samples=200
)
hdb_clust.fit(features_embedding)
# Broadcast per-account labels onto per-match rows; dict(zip(...)) replaces
# the redundant dict comprehension.
# NOTE(review): assumes features_embedding rows are ordered like
# df['account_id'].unique() — TODO confirm against how the embedding was built.
df['hdb_clusters'] = df['account_id'].map(
    dict(zip(df['account_id'].unique(), hdb_clust.labels_))
)
# Time-gap profiles coloured by HDBSCAN clusters.
profiles_visualizer(
    df=df,
    target='time_gap',
    target_rmp={'time_gap': 'TIME GAP'},
    dim_reduction=viz_embedding,
    grouper='hdb_clusters',
    grouper_rmp={'hdb_clusters': 'HDBSCAN Spacing'},
    cmap='tab10',
    legend=True,
)
# Learning curves per HDBSCAN cluster; -1 is HDBSCAN's noise label and
# is excluded.
hdb_clustered = df[df['hdb_clusters'] != -1].sort_values('hdb_clusters')
curves_visualizer(
    df=hdb_clustered,
    targets=TARGETS,
    grouper='hdb_clusters',
    grouper_rmp={'hdb_clusters': 'HDBSCAN Spacing'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Rebuild the per-account time-gap matrix: one row per account, matches in
# chronological order.
df = df.sort_values(['account_id', 'nth_match'])
unique_ids = df['account_id'].nunique()  # idiomatic replacement for len(unique())
X = df['time_gap'].values
# -1 infers the per-account series length (100 in this dataset) instead of
# hard-coding it, so the pipeline survives a change in matches-per-account.
X = X.reshape((unique_ids, -1))
# Trim the first column and the last five — presumably the undefined first
# gap and a tail of incomplete matches; TODO confirm.
X = X[:, 1:-5]
# Per-series mean/variance scaling before clustering.
X = tsmv().fit_transform(X)
km_clust = auto_k_means(
    X=X,
    min_k=2,
    max_k=10,
    save_path='results\\figures\\k_means',
    max_iter=300,
    n_init=2
)
# Broadcast per-account labels onto per-match rows; dict(zip(...)) replaces
# the redundant dict comprehension.
df['km_clusters'] = df['account_id'].map(
    dict(zip(df['account_id'].unique(), km_clust.labels_))
)
# Time-gap profiles coloured by the k-means clusters.
profiles_visualizer(
    df=df,
    target='time_gap',
    target_rmp={'time_gap': 'TIME GAP'},
    dim_reduction=viz_embedding,
    grouper='km_clusters',
    grouper_rmp={'km_clusters': 'K-Means Spacing'},
    cmap='tab10',
    legend=True,
)
# Learning curves per k-means cluster (k-means assigns every account, so
# no noise filter is needed).
km_sorted = df.sort_values('km_clusters')
curves_visualizer(
    df=km_sorted,
    targets=TARGETS,
    grouper='km_clusters',
    grouper_rmp={'km_clusters': 'K-Means Spacing'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Denoising recurrent autoencoder over the scaled time-gap sequences.
sequence_autoencoder = RecurrentAutoEncoder(
    X=X,
    noise=1,
    units=60,
    latent_space=30,
    loss='mae',
    optimizer='adam',
    output_activation='linear'
)
# Stop training once validation loss plateaus and roll back to the best
# checkpoint.
stopper = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    patience=15,
    restore_best_weights=True
)
sequence_autoencoder.fit(
    X,
    validation_split=0.2,
    batch_size=512,
    epochs=1000,
    callbacks=[stopper],
    verbose=1
)
# Cluster accounts in the autoencoder's learned latent space.
embedding_feature = sequence_autoencoder.encode(X)
rnn_clust = auto_k_means(
    embedding_feature,
    min_k=2,
    max_k=11,
    save_path='results\\figures\\autoenc',
    max_iter=3000,
    n_init=2000,
    batch_size=512
)
# Broadcast per-account labels onto per-match rows; dict(zip(...)) replaces
# the redundant dict comprehension.
df['rnn_clusters'] = df['account_id'].map(
    dict(zip(df['account_id'].unique(), rnn_clust.labels_))
)
# Time-gap profiles coloured by the autoencoder-based clusters.
profiles_visualizer(
    df=df,
    target='time_gap',
    target_rmp={'time_gap': 'TIME GAP'},
    dim_reduction=viz_embedding,
    grouper='rnn_clusters',
    grouper_rmp={'rnn_clusters': 'AutEnc Spacing'},
    cmap='tab10',
    legend=True,
)
# Learning curves per autoencoder-based cluster.
rnn_sorted = df.sort_values('rnn_clusters')
curves_visualizer(
    df=rnn_sorted,
    targets=TARGETS,
    grouper='rnn_clusters',
    grouper_rmp={'rnn_clusters': 'AutEnc Spacing'},
    target_rmp=target_rmp,
    rows=1,
    columns=2,
    cmap='tab10',
    # Filter dict: accounts outside the calibration bins are dropped.
    accounts_to_retain=accounts_to_retain,
)
# Pairwise agreement between the four clustering schemes.  Rows labelled -1
# (noise/unassigned) are dropped for whichever scheme requires it.  The
# original six explicit calls are driven from a spec table; call order and
# arguments are identical.
display_names = {
    'rule_clusters': 'Stafford Spacing',
    'km_clusters': 'K-Means Spacing',
    'hdb_clusters': 'HDBSCAN Spacing',
    'rnn_clusters': 'AutEnc Spacing',
}
rule_ok = df[df['rule_clusters'] != -1]
hdb_ok = df[df['hdb_clusters'] != -1]
both_ok = df[(df['hdb_clusters'] != -1) & (df['rule_clusters'] != -1)]
overlap_specs = [
    (rule_ok, 'rule_clusters', 'km_clusters'),
    (rule_ok, 'rule_clusters', 'rnn_clusters'),
    (rule_ok, 'km_clusters', 'rnn_clusters'),
    (hdb_ok, 'hdb_clusters', 'km_clusters'),
    (hdb_ok, 'hdb_clusters', 'rnn_clusters'),
    (both_ok, 'hdb_clusters', 'rule_clusters'),
]
for subset, left, right in overlap_specs:
    visualize_cluster_overlap(
        subset,
        left,
        right,
        groups_rmp={left: display_names[left], right: display_names[right]}
    )
df.to_csv('data\\df_non_smoothed.csv', index=False)